###############################################                                     
# Cause or Effect? Turnout in Hispanic Majority-Minority Districts
#  - John A. Henderson, Jasjeet S. Sekhon, and Rocio Titiunik
#  - Forthcoming in Political Analysis
#  - Replication file for <Table I>
#  - April 14, 2016
###############################################

##################
# Main Balance Table on the New Match Runs + Pre-trimmed unmatched 
##################

# Console width wide enough for the balance tables printed at the end.
options(width = 150)
# Clear the global workspace so the run starts from a clean state.
rm(list = ls())

# Root folder that holds replicationPA/. The trailing slash is required because
# file names are appended to it directly.
path <- "/local/"

# Project-wide helper script.
# NOTE(review): presumably attaches the packages used further down (e.g. the
# one providing MatchBalance) -- confirm.
source(paste0(path, "replicationPA/funs/headers.R"))

set.seed(57459)

# Earlier ways of loading the data, kept for reference:
#load('~/Desktop/Research/Projects/STHEthnicity/mergedData2013/dataCA-1998-2006-recreated.Rdata')
#WH2data <- read.csv(file = paste(path,"Baseline-afterMATCH.csv",sep=''), header = TRUE, stringsAsFactors = FALSE)
#WH2data <- read.dta(file = paste(path,"replicationPA/data/Baseline-afterMATCH-recreated.dta",sep=''))

# Baseline sample (used below for the before-matching balance tests).
# The .Rdata file is expected to define `WH2data`.
load(file = paste0(path, "replicationPA/data/Baseline-afterMATCH-recreated.Rdata"))
dim(WH2data)

# Keep only rows flagged dfound_pair_newdata == 1; which() also drops rows
# whose flag is NA.
WH2data <- WH2data[which(WH2data$dfound_pair_newdata == 1), ]
dim(WH2data)

# Matched sample (used below for the after-matching balance tests).
# The .Rdata file is expected to define `Sdata`.
#Sdata <-  read.dta(file = paste(path,"replicationPA/data/Matched-recreated.dta",sep=''))
load(file = paste0(path, "replicationPA/data/Matched-recreated.Rdata"))
dim(Sdata)

# Same row filter as for WH2data.
Sdata <- Sdata[which(Sdata$dfound_pair_newdata == 1), ]
dim(Sdata)


# Adds the derived covariates used by the balance table to one data frame and
# standardises the name of the treatment indicator.
#
# The same recipe is applied to both WH2data and Sdata, so it lives in a single
# helper; this keeps the two data sets from drifting apart. New columns are
# appended in the same order for both data frames; the rest of the script only
# refers to columns by name.
#
# d:       data frame containing the raw columns referenced below.
# returns: `d` with the derived columns added and a column named `tr`
#          (if present) renamed to `Tr`.
add_derived_vars <- function(d) {
  # Hispanic and non-Hispanic voting-age population.
  d$hvap  <- d$pop_hispanic18
  d$nhvap <- d$vap - d$hvap

  # Collapse the 1999 household-income shares into broader brackets.
  d$phh_income99_39less  <- d$phh_income99_0to19    + d$phh_income99_20to39
  d$phh_income99_40to74  <- d$phh_income99_40to59   + d$phh_income99_60to74
  d$phh_income99_100plus <- d$phh_income99_100to199 + d$phh_income99_200

  # Foreign-born share = naturalised citizens + non-citizens.
  d$ppop_foreign <- d$ppop_foreign_naturcit + d$ppop_foreign_nocit

  # 1998: non-Hispanic registration count and its share of total registration.
  d$y98_reg_nhisp  <- d$y98_reg_tot - d$y98_reg_hisp
  d$y98_preg_nhisp <- d$y98_reg_nhisp / d$y98_reg_tot

  # 2000-2006: non-Hispanic turnout and registration counts (total - Hispanic).
  d$y00_turn_nhisp <- d$y00_turn_tot - d$y00_turn_hisp
  d$y00_reg_nhisp  <- d$y00_reg_tot  - d$y00_reg_hisp

  d$y02_turn_nhisp <- d$y02_turn_tot - d$y02_turn_hisp
  d$y02_reg_nhisp  <- d$y02_reg_tot  - d$y02_reg_hisp

  d$y04_turn_nhisp <- d$y04_turn_tot - d$y04_turn_hisp
  d$y04_reg_nhisp  <- d$y04_reg_tot  - d$y04_reg_hisp

  d$y06_turn_nhisp <- d$y06_turn_tot - d$y06_turn_hisp
  d$y06_reg_nhisp  <- d$y06_reg_tot  - d$y06_reg_hisp

  # 2000 Republican registration share, computed as the 1998 Dem + Rep share
  # minus the 2000 Dem share.
  # NOTE(review): presumably an imputation because no 2000 Republican share is
  # available in the data -- confirm.
  d$y00_preg_rep <- (d$y98_preg_dem + d$y98_preg_rep) - d$y00_preg_dem

  # 2000 registration rate relative to voting-age population, capped at 1.
  # (An alternative denominator, pop_total, was tried and left disabled.)
  d$y00_preg_tot <- d$y00_reg_tot / d$vap
  d$y00_preg_tot[d$y00_preg_tot > 1] <- 1

  # The balance code below expects the treatment indicator to be called `Tr`.
  names(d)[which(names(d) == "tr")] <- "Tr"

  d
}

WH2data <- add_derived_vars(WH2data)
Sdata   <- add_derived_vars(Sdata)


# ---- Balance covariates for the matched sample (Sdata) ----
# The arguments below are left unnamed on purpose: data.frame() then names each
# column after its expression (e.g. `Sdata$vap` becomes "Sdata.vap").

# Conditioning/balance set: most important variables first.
imp.vars <- data.frame(
  Sdata$vap,
  Sdata$ppop_black18,
  Sdata$ppop_hispanic18,
  Sdata$phh_income99_39less,
  Sdata$phh_income99_40to74,
  #Sdata$phh_income99_75to99,
  Sdata$phh_income99_100plus,
  Sdata$ppop_25_hsless,
  Sdata$ppop_foreign,
  Sdata$ppop_foreign_naturcit,
  Sdata$ppop_foreign_nocit
)

# Registration variables: 1998 and 2000 only.
reg.vars <- data.frame(
  Sdata$y98_preg_tot,
  Sdata$y98_preg_hisp, # no turnout info for 1998, so this is the closest we have to previous outcome
  #Sdata$y98_preg_nhisp,
  Sdata$y98_preg_dem,
  Sdata$y98_preg_rep,
  Sdata$y00_preg_tot,
  Sdata$y00_preg_dem,
  Sdata$y00_preg_rep
)
dim(reg.vars)
cat("Final dimension of reg.vars: ", dim(reg.vars), "\n")

# Vote variables.
vote.vars <- data.frame(
  Sdata$y98_pvote_ussdem,        # statewide and local offices
  Sdata$y98_pvote_govdem,
  Sdata$y98_pvote_cngdem,
  #Sdata$y98_pvote_assdem,
  #Sdata$y98_pvote_atgdem
  Sdata$y00_vote_pprsdem,
  Sdata$y00_pvote_ussdem,
  Sdata$y00_pvote_cngdem
)
dim(vote.vars)
cat("Final dimension of vote.vars: ", dim(vote.vars), "\n")

# One 0/1 indicator column per distinct DisTri_1991 value. The indicators are
# built here but are not part of the covariate set: the corresponding entries
# in pop.vars below are disabled.
un_trip <- unique(Sdata$DisTri_1991)
triplet <- matrix(0, nrow(Sdata), length(un_trip))
for (j in seq_along(un_trip)) {
  # which() skips rows where DisTri_1991 is NA, so those rows stay 0.
  triplet[which(Sdata$DisTri_1991 == un_trip[j]), j] <- 1
}
triplet <- as.data.frame(triplet)
# Only the data frame's names are set here; the old chained assignment also
# created a global variable called `colnames`, which nothing used.
names(triplet) <- as.character(un_trip)

# Population variables.
pop.vars <- data.frame(
  Sdata$ppop_fem,
  Sdata$ppop_25to44,
  Sdata$ppop_45to59
  #Sdata$ppop_70older
  #as.factor(Sdata$DisTri_1991)
  #triplet
)
cat("Final dimension of pop.vars: ", dim(pop.vars), "\n")

# Full covariate matrix for the matched sample. Sdata$DisTri_1991 is excluded
# since we will keep one unique triplet in every file.
SXall <- data.frame(imp.vars, reg.vars, vote.vars, pop.vars)
#X <- data.frame (imp.vars)
#B <- X
dim(SXall)
   
# ---- Balance covariates for the baseline sample (WH2data) ----
# The arguments below are left unnamed on purpose: data.frame() then names each
# column after its expression (e.g. `WH2data$vap` becomes "WH2data.vap"). The
# column names of WXall are later used as the row labels of the balance tables.

# One 0/1 indicator column per distinct DisTri_1991 value. The indicators are
# built here but are not part of the covariate set: the corresponding entries
# in pop.vars below are disabled.
un_trip <- unique(WH2data$DisTri_1991)
triplet <- matrix(0, nrow(WH2data), length(un_trip))
for (j in seq_along(un_trip)) {
  # which() skips rows where DisTri_1991 is NA, so those rows stay 0.
  triplet[which(WH2data$DisTri_1991 == un_trip[j]), j] <- 1
}
triplet <- as.data.frame(triplet)
# Only the data frame's names are set here; the old chained assignment also
# created a global variable called `colnames`, which nothing used.
names(triplet) <- as.character(un_trip)

# Conditioning/balance set: most important variables first.
imp.vars <- data.frame(
  WH2data$vap,
  WH2data$ppop_black18,
  WH2data$ppop_hispanic18,
  WH2data$phh_income99_39less,
  WH2data$phh_income99_40to74,
  #WH2data$phh_income99_75to99,
  WH2data$phh_income99_100plus,
  WH2data$ppop_25_hsless,
  WH2data$ppop_foreign,
  WH2data$ppop_foreign_naturcit,
  WH2data$ppop_foreign_nocit
)

# Registration variables: 1998 and 2000 only.
reg.vars <- data.frame(
  WH2data$y98_preg_tot,
  WH2data$y98_preg_hisp, # no turnout info for 1998, so this is the closest we have to previous outcome
  #WH2data$y98_preg_nhisp,
  WH2data$y98_preg_dem,
  WH2data$y98_preg_rep,
  WH2data$y00_preg_tot,
  WH2data$y00_preg_dem,
  WH2data$y00_preg_rep
)
dim(reg.vars)
cat("Final dimension of reg.vars: ", dim(reg.vars), "\n")

# Vote variables.
vote.vars <- data.frame(
  WH2data$y98_pvote_ussdem,        # statewide and local offices
  WH2data$y98_pvote_govdem,
  WH2data$y98_pvote_cngdem,
  #WH2data$y98_pvote_assdem,
  #WH2data$y98_pvote_atgdem
  WH2data$y00_vote_pprsdem,
  WH2data$y00_pvote_ussdem,
  WH2data$y00_pvote_cngdem
)
dim(vote.vars)
cat("Final dimension of vote.vars: ", dim(vote.vars), "\n")

# Population variables.
pop.vars <- data.frame(
  WH2data$ppop_fem,
  WH2data$ppop_25to44,
  WH2data$ppop_45to59
  #WH2data$ppop_70older
  #as.factor(WH2data$DisTri_1991)
  #triplet
)
cat("Final dimension of pop.vars: ", dim(pop.vars), "\n")

# Full covariate matrix for the baseline sample. WH2data$DisTri_1991 is
# excluded since we will keep one unique triplet in every file.
WXall <- data.frame(imp.vars, reg.vars, vote.vars, pop.vars)
#X <- data.frame (imp.vars)
#B <- X
dim(WXall)

  

#####################################################################################
## Generate balance matrices for WH1data and WH2data
#####################################################################################
#source("generate-balance-matrices.R")

#####################################################################################
## Generate turnout and registration rates using different denominators for Sdata, WH2data and WH1data
#####################################################################################
#source("generate-turnout-measures.R")

######################################################

# Run GenMatchStack to get matched dataset 

######################################################
  
######################################################
### Results matrices
######################################################
# Row labels: one per covariate, taken from the baseline covariate matrix.
BALrownm <- colnames(WXall)

# Column labels: the same four statistics for each table, suffixed with the
# sample they describe ("WH2-mat" for the baseline sample, "AM" for the
# matched sample).
BALcolnm1 <- paste(c("Mean Tr", "Mean Co", "Diff means p-val", "KS test p-val"), "WH2-mat")
BALcolnm2 <- paste(c("Mean Tr", "Mean Co", "Diff means p-val", "KS test p-val"), "AM")

# Empty (all-NA) result tables, filled in by the balance tests below.
RESBAL1 <- matrix(
  NA,
  nrow = length(BALrownm),
  ncol = length(BALcolnm1),
  dimnames = list(BALrownm, BALcolnm1)
)
RESBAL2 <- matrix(
  NA,
  nrow = length(BALrownm),
  ncol = length(BALcolnm2),
  dimnames = list(BALrownm, BALcolnm2)
)


######################
# Sample sizes
#####################
# Dimensions and treated/control counts of the baseline sample ...
dim(WH2data)
table(WH2data$Tr)

# ... and of the matched sample.
dim(Sdata)
table(Sdata$Tr)


######################################################
# load indices to fix measurement bias problem
######################################################

#load('~/share/ethnicity/California/final-runs/qqmatch_index-PPOP18_Final.Rdata')

######################################################
### Balance tests
######################################################

# Number of bootstrap replications passed to MatchBalance(); 0 requests none.
# The KS p-value is therefore read from $ks$ks$p.value (a bootstrapped p-value
# would be under $ks$ks.boot.pvalue).
nboots <- 0

########################
## before matching:
#######################

# Row indices of treated and control units in the baseline sample. Rows whose
# Tr is NA fall into neither group.
Tr <- WH2data$Tr
m1 <- list(
  index.treated = which(Tr == TRUE),
  index.control = which(Tr == FALSE)
)

# Stack treated rows first, then controls, for both the indicator and the
# covariates so the two stay aligned.
TrWH2 <- Tr[c(m1$index.treated, m1$index.control)]
XWH2all_base <- WXall[c(m1$index.treated, m1$index.control), ]

mb <- MatchBalance(TrWH2 ~ as.matrix(XWH2all_base), ks = TRUE, nboots = nboots)

# Copy the statistics for covariate i into the row of RESBAL1 that carries
# that covariate's name; BeforeMatching is read by position.
for (i in seq_along(BALrownm)) {
  varnm <- BALrownm[i]
  bal <- mb$BeforeMatching[[i]]
  RESBAL1[varnm, c("Mean Tr WH2-mat", "Mean Co WH2-mat", "Diff means p-val WH2-mat", "KS test p-val WH2-mat")] <-
    c(bal$mean.Tr, bal$mean.Co, bal$p.value, bal$ks$ks$p.value)
}
    

########################
## after matching: 
#######################


# Row indices of treated and control units in the matched sample. Rows whose
# Tr is NA fall into neither group.
Tr <- Sdata$Tr
m3 <- list(
  index.treated = which(Tr == TRUE),
  index.control = which(Tr == FALSE)
)

# Stack treated rows first, then controls, for both the indicator and the
# covariates so the two stay aligned.
TrS <- Tr[c(m3$index.treated, m3$index.control)]
Xall_base <- SXall[c(m3$index.treated, m3$index.control), ]

# No match object is passed to MatchBalance(): Sdata is already the matched
# sample, so its balance is read from the BeforeMatching component.
mb <- MatchBalance(TrS ~ as.matrix(Xall_base), ks = TRUE, nboots = nboots)

# RESBAL2's rows are labelled with the column names of WXall, while the
# statistics come from SXall by position. This relies on SXall and WXall
# listing the same variables in the same order.
for (i in seq_along(BALrownm)) {
  varnm <- BALrownm[i]
  bal <- mb$BeforeMatching[[i]]
  RESBAL2[varnm, c("Mean Tr AM", "Mean Co AM", "Diff means p-val AM", "KS test p-val AM")] <-
    c(bal$mean.Tr, bal$mean.Co, bal$p.value, bal$ks$ks$p.value)
}


######################################################
### Print and convert to latex
######################################################
# Balance table for the baseline sample (columns labelled "WH2-mat").
print(RESBAL1)
# Disabled LaTeX export of the same table:
#print(xtable(round(RESBAL1,digits=4)), type="latex", file="tables-WH-RESBAL1-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)

# Balance table for the matched sample (columns labelled "AM").
print(RESBAL2)
# Disabled LaTeX export of the same table:
#print(xtable(round(RESBAL2,digits=4)), type="latex", file="tables-WH-RESBAL2-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)

#print(RESBAL3)
#print(xtable(round(RESBAL3,digits=4)), type="latex", file="tables-WH-RESBAL3-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)

#print(RESBAL4)
#print(xtable(round(RESBAL4,digits=4)), type="latex", file="tables-WH-RESBAL4-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)

#print(RESBAL5)
#print(xtable(round(RESBAL5,digits=4)), type="latex", file="tables-WH-RESBAL5-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)
                  

####### END HERE